#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle
%matplotlib inline
#load data
gsearch_jobs=pd.read_csv('D:\Sales Data analysis python\gsearch_jobs.csv')
#droping irrelevant column
gsearch_jobs.drop("Unnamed: 0",axis=1,inplace=True)
#show dataframe
gsearch_jobs.head(3)
| index | title | company_name | location | via | description | extensions | job_id | thumbnail | posted_at | ... | commute_time | salary_pay | salary_rate | salary_avg | salary_min | salary_max | salary_hourly | salary_yearly | salary_standardized | description_tokens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Data Analyst | Robert Half | Oklahoma City, OK | via LinkedIn | Description\n\nRobert Half is looking for a pr... | ['24 hours ago', 'Contractor', 'Health insuran... | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 24 hours ago | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ['go'] |
| 1 | 1 | Data Analyst | Apex Health Solutions | United States | via LinkedIn | Data Analyst Summary Apex Health Solutions is ... | ['21 hours ago', 'Full-time', 'Health insurance'] | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 21 hours ago | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ['sql', 'power_bi', 'azure', 'excel'] |
| 2 | 2 | Marketing Data Analyst | Ledger Bennett | Anywhere | via LinkedIn | At Ledger Bennett, we strive to help our emplo... | ['21 hours ago', 'Work from home', 'Full-time'] | eyJqb2JfdGl0bGUiOiJNYXJrZXRpbmcgRGF0YSBBbmFseX... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 21 hours ago | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ['sql', 'tableau', 'excel'] |
3 rows × 26 columns
#checking Datatype of DataFrame gsearch_jobs and not null values present in each column
gsearch_jobs.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19495 entries, 0 to 19494 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 19495 non-null int64 1 title 19495 non-null object 2 company_name 19495 non-null object 3 location 19480 non-null object 4 via 19495 non-null object 5 description 19495 non-null object 6 extensions 19495 non-null object 7 job_id 19495 non-null object 8 thumbnail 9793 non-null object 9 posted_at 19495 non-null object 10 schedule_type 19374 non-null object 11 work_from_home 8743 non-null object 12 salary 3733 non-null object 13 search_term 19495 non-null object 14 date_time 19495 non-null object 15 search_location 19495 non-null object 16 commute_time 0 non-null float64 17 salary_pay 3733 non-null object 18 salary_rate 3733 non-null object 19 salary_avg 3733 non-null float64 20 salary_min 3507 non-null float64 21 salary_max 3507 non-null float64 22 salary_hourly 2190 non-null float64 23 salary_yearly 1537 non-null float64 24 salary_standardized 3733 non-null float64 25 description_tokens 19495 non-null object dtypes: float64(7), int64(1), object(18) memory usage: 3.9+ MB
#List features of gsearch_jobs and calculate the associated number of missing values per feature
gsjob_of_null_values = gsearch_jobs.isnull().sum()
for k, v in gsjob_of_null_values.items():
percentage = round((v * 100 / gsearch_jobs['index'].index.size),2)
print(k,", ",v, "(", percentage ,"%)")
index , 0 ( 0.0 %) title , 0 ( 0.0 %) company_name , 0 ( 0.0 %) location , 15 ( 0.08 %) via , 0 ( 0.0 %) description , 0 ( 0.0 %) extensions , 0 ( 0.0 %) job_id , 0 ( 0.0 %) thumbnail , 9702 ( 49.77 %) posted_at , 0 ( 0.0 %) schedule_type , 121 ( 0.62 %) work_from_home , 10752 ( 55.15 %) salary , 15762 ( 80.85 %) search_term , 0 ( 0.0 %) date_time , 0 ( 0.0 %) search_location , 0 ( 0.0 %) commute_time , 19495 ( 100.0 %) salary_pay , 15762 ( 80.85 %) salary_rate , 15762 ( 80.85 %) salary_avg , 15762 ( 80.85 %) salary_min , 15988 ( 82.01 %) salary_max , 15988 ( 82.01 %) salary_hourly , 17305 ( 88.77 %) salary_yearly , 17958 ( 92.12 %) salary_standardized , 15762 ( 80.85 %) description_tokens , 0 ( 0.0 %)
Feature location has 0.08 % missing value feature salaries has more than 80% missing value which is huge here we will fill blank salary value with average salaries according to respective job_type to get more accuracy in data
gsearch_jobs.head(2)
| index | title | company_name | location | via | description | extensions | job_id | thumbnail | posted_at | ... | commute_time | salary_pay | salary_rate | salary_avg | salary_min | salary_max | salary_hourly | salary_yearly | salary_standardized | description_tokens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Data Analyst | Robert Half | Oklahoma City, OK | via LinkedIn | Description\n\nRobert Half is looking for a pr... | ['24 hours ago', 'Contractor', 'Health insuran... | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 24 hours ago | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ['go'] |
| 1 | 1 | Data Analyst | Apex Health Solutions | United States | via LinkedIn | Data Analyst Summary Apex Health Solutions is ... | ['21 hours ago', 'Full-time', 'Health insurance'] | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 21 hours ago | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ['sql', 'power_bi', 'azure', 'excel'] |
2 rows × 26 columns
#Converting salary_min,salary_max into per annum for monthly and hourly data as per salary rate
gsearch_jobs.loc[gsearch_jobs['salary_rate'] == 'a month', ['salary_min','salary_max']] *= 12
gsearch_jobs.loc[gsearch_jobs['salary_rate'] == 'an hour', ['salary_min','salary_max']] *= 2080
gsearch_jobs.loc[gsearch_jobs['salary_rate'] == 'a year', ['salary_min','salary_max']] *=1
gsearch_jobs.tail(2)
| index | title | company_name | location | via | description | extensions | job_id | thumbnail | posted_at | ... | commute_time | salary_pay | salary_rate | salary_avg | salary_min | salary_max | salary_hourly | salary_yearly | salary_standardized | description_tokens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19493 | 1365 | Lead-Data Analyst | EDWARD JONES | Calhoun, MO | via My ArkLaMiss Jobs | At Edward Jones, we help clients achieve their... | ['23 hours ago', '106,916–182,047 a year', 'Fu... | eyJqb2JfdGl0bGUiOiJMZWFkLURhdGEgQW5hbHlzdCIsIm... | NaN | 23 hours ago | ... | NaN | 106916–182047 | a year | 144481.5 | 106916.0 | 182047.0 | NaN | 144481.5 | 144481.5 | [] |
| 19494 | 1366 | Institutional Credit Management - Lending Data... | Citi | United States | via My ArkLaMiss Jobs | The Institutional Credit Management (ICM) grou... | ['24 hours ago', '105,850–158,780 a year', 'Fu... | eyJqb2JfdGl0bGUiOiJJbnN0aXR1dGlvbmFsIENyZWRpdC... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 24 hours ago | ... | NaN | 105850–158780 | a year | 132315.0 | 105850.0 | 158780.0 | NaN | 132315.0 | 132315.0 | ['cognos', 'tableau'] |
2 rows × 26 columns
#selecting only required and useful columns
gsearch_jobs=gsearch_jobs[['index','title','company_name','location','via','description','extensions','job_id','thumbnail',
'posted_at','schedule_type','work_from_home','search_term','date_time','search_location',
'salary_min','salary_max','salary_standardized','description_tokens']]
gsearch_jobs.head(2)
| index | title | company_name | location | via | description | extensions | job_id | thumbnail | posted_at | schedule_type | work_from_home | search_term | date_time | search_location | salary_min | salary_max | salary_standardized | description_tokens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Data Analyst | Robert Half | Oklahoma City, OK | via LinkedIn | Description\n\nRobert Half is looking for a pr... | ['24 hours ago', 'Contractor', 'Health insuran... | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 24 hours ago | Contractor | NaN | data analyst | 2023-05-21 07:00:30.990626 | United States | NaN | NaN | NaN | ['go'] |
| 1 | 1 | Data Analyst | Apex Health Solutions | United States | via LinkedIn | Data Analyst Summary Apex Health Solutions is ... | ['21 hours ago', 'Full-time', 'Health insurance'] | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 21 hours ago | Full-time | NaN | data analyst | 2023-05-21 07:00:30.990626 | United States | NaN | NaN | NaN | ['sql', 'power_bi', 'azure', 'excel'] |
# Calculate average salaries for each schedule_type and create DataFrames
avg_salary_min = pd.DataFrame(gsearch_jobs.groupby(['schedule_type'])['salary_min'].mean()).reset_index()
avg_salary_max = pd.DataFrame(gsearch_jobs.groupby(['schedule_type'])['salary_max'].mean()).reset_index()
avg_salary_std = pd.DataFrame(gsearch_jobs.groupby(['schedule_type'])['salary_standardized'].mean()).reset_index()
# Merge the DataFrames on 'schedule_type'
merged_df = gsearch_jobs.merge(avg_salary_min, on='schedule_type', how='left')
merged_df = merged_df.merge(avg_salary_max, on='schedule_type', how='left')
merged_df = merged_df.merge(avg_salary_std, on='schedule_type', how='left')
# Fill missing values in 'salary_min', 'salary_max', and 'salary_standardized' with respective average values
merged_df['salary_min_x'] = merged_df['salary_min_x'].fillna(merged_df['salary_min_y'])
merged_df['salary_max_x'] = merged_df['salary_max_x'].fillna(merged_df['salary_max_y'])
merged_df['salary_standardized_x'] = merged_df['salary_standardized_x'].fillna(merged_df['salary_standardized_y'])
# Drop the average columns
merged_df = merged_df.drop(['salary_min_y', 'salary_max_y', 'salary_standardized_y'], axis=1)
merged_df.head(3)
| index | title | company_name | location | via | description | extensions | job_id | thumbnail | posted_at | schedule_type | work_from_home | search_term | date_time | search_location | salary_min_x | salary_max_x | salary_standardized_x | description_tokens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Data Analyst | Robert Half | Oklahoma City, OK | via LinkedIn | Description\n\nRobert Half is looking for a pr... | ['24 hours ago', 'Contractor', 'Health insuran... | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 24 hours ago | Contractor | NaN | data analyst | 2023-05-21 07:00:30.990626 | United States | 64444.735327 | 119938.245847 | 92496.081529 | ['go'] |
| 1 | 1 | Data Analyst | Apex Health Solutions | United States | via LinkedIn | Data Analyst Summary Apex Health Solutions is ... | ['21 hours ago', 'Full-time', 'Health insurance'] | eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 21 hours ago | Full-time | NaN | data analyst | 2023-05-21 07:00:30.990626 | United States | 84346.076967 | 118330.186535 | 99369.761573 | ['sql', 'power_bi', 'azure', 'excel'] |
| 2 | 2 | Marketing Data Analyst | Ledger Bennett | Anywhere | via LinkedIn | At Ledger Bennett, we strive to help our emplo... | ['21 hours ago', 'Work from home', 'Full-time'] | eyJqb2JfdGl0bGUiOiJNYXJrZXRpbmcgRGF0YSBBbmFseX... | https://encrypted-tbn0.gstatic.com/images?q=tb... | 21 hours ago | Full-time | True | data analyst | 2023-05-21 07:00:30.990626 | United States | 84346.076967 | 118330.186535 | 99369.761573 | ['sql', 'tableau', 'excel'] |
#Rrenaming the salaries column
merged_df.rename(columns={'salary_min_x':'salary_min','salary_max_x':'salary_max','salary_standardized_x':'salary_standardized'},inplace=True)
merged_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19495 entries, 0 to 19494 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 19495 non-null int64 1 title 19495 non-null object 2 company_name 19495 non-null object 3 location 19480 non-null object 4 via 19495 non-null object 5 description 19495 non-null object 6 extensions 19495 non-null object 7 job_id 19495 non-null object 8 thumbnail 9793 non-null object 9 posted_at 19495 non-null object 10 schedule_type 19374 non-null object 11 work_from_home 8743 non-null object 12 search_term 19495 non-null object 13 date_time 19495 non-null object 14 search_location 19495 non-null object 15 salary_min 19377 non-null float64 16 salary_max 19377 non-null float64 17 salary_standardized 19378 non-null float64 18 description_tokens 19495 non-null object dtypes: float64(3), int64(1), object(15) memory usage: 2.8+ MB
from matplotlib import pyplot as plt
plt.figure(figsize=(7.5,8.5))
emp_type=merged_df['schedule_type'].value_counts(dropna=True)
myexplode = [0.05, 0.03, 0.2, 0.4]
plt.pie(emp_type,labels=emp_type.index,autopct='%1.1f%%',explode=myexplode)
plt.axis('equal')
plt.title('Distribution of Schedule Types',fontsize=18,fontweight='bold')
legend_labels = []
for i, count in enumerate(emp_type):
if pd.isna(emp_type.index[i]):
label = 'Blank'
else:
label = emp_type.index[i]
legend_labels.append(f"{label}: {count}")
# Add legends based on the label list
legend_title = 'Employment Types (Count)'
legend=plt.legend(legend_labels, loc='upper right', title=legend_title)
legend.get_title().set_fontweight('bold')
plt.show()
#here we can say that companies mostly hires for full time and after that a big percentage of companies also advertize recruitment on contract basis
#count the number of advertisement posted on job recruting sites
gsearch_via_count=pd.DataFrame(merged_df['via'].value_counts(dropna=True))
gsearch_via_count=gsearch_via_count.sort_values('count',ascending=False)
#For better visualization just consider top 10 most used sites
gsearch_via_count=gsearch_via_count.head(10)
gsearch_via_count.index = gsearch_via_count.index.str.replace('via', '')
gsearch_via_count
| count | |
|---|---|
| via | |
| 6348 | |
| Upwork | 3012 |
| BeBee | 1340 |
| Trabajo.org | 1323 |
| ZipRecruiter | 1218 |
| Indeed | 1047 |
| Adzuna | 404 |
| Snagajob | 366 |
| Monster | 335 |
| My ArkLaMiss Jobs | 270 |
plt.figure(figsize=(12,6))
plt.bar(gsearch_via_count.index,gsearch_via_count['count'])
plt.xlabel('job advertisement via',fontsize=16)
plt.ylabel('number of Job posted',fontsize=16)
plt.title('Top 10 Recruiter Sites',fontsize=18,fontweight='bold')
plt.show
#from here we can see most popular job recruitment sites is linkedln and then upwork and bee
#but linkedln is far away from all other recruitment sites
<function matplotlib.pyplot.show(close=None, block=None)>
merged_df['location'].nunique()
521
merged_df.dtypes
index int64 title object company_name object location object via object description object extensions object job_id object thumbnail object posted_at object schedule_type object work_from_home object search_term object date_time object search_location object salary_min float64 salary_max float64 salary_standardized float64 description_tokens object dtype: object
#changing following columns to float
merged_df[['salary_min','salary_max']]=merged_df[['salary_min','salary_max']].astype('float')
#finding the maximum avg salary by companies
gsearch_jobs_avg_sal=merged_df.groupby('company_name').agg({
'salary_min': 'mean',
'salary_max': 'mean',
'salary_standardized':'mean'
})
#Top 10 companies offering maximum salary_max
top_10_salary_max=gsearch_jobs_avg_sal[['salary_max']]
top_10_salary_max=top_10_salary_max.sort_values('salary_max',ascending=False).head(10)
#plot a horizontal bar graph
from matplotlib import pyplot as plt
plt.figure(figsize=(10,29))
ax=top_10_salary_max.plot.barh(y='salary_max',legend=False)
for index, value in enumerate(top_10_salary_max['salary_max']):
ax.text(value + 1, index, str(value), color='black')
# Set the plot title and labels
ax.set_title('Top Companies offering highest maximum average salary',fontweight='bold')
ax.set_xlabel('salary--------->')
ax.set_ylabel('Company Name--------->')
#here we can see that the Corps team offering higesht maximum salary_max i.e. 428480 and it is far way from others companies
Text(0, 0.5, 'Company Name--------->')
<Figure size 1000x2900 with 0 Axes>
import pandas as pd
import matplotlib.pyplot as plt
# Group the data by schedule_type and calculate the average salary
average_df = merged_df.groupby('schedule_type')[['salary_min', 'salary_max','salary_standardized']].mean()
# Reset the index to make 'company' a regular column
average_df.reset_index(inplace=True)
plt.plot(average_df['schedule_type'], average_df['salary_min'], marker='o', label='Max Salary')
plt.plot(average_df['schedule_type'], average_df['salary_max'], marker='o', label='Min Salary')
plt.plot(average_df['schedule_type'], average_df['salary_standardized'], marker='o', label='Average Salary')
# Customizing the plot
plt.xlabel('Schedule Type')
plt.ylabel('Salary')
plt.title('Salary Distribution by Schedule Type',fontweight='bold')
legend_labels=['salary_min', 'salary_max','salary_standardized']
plt.legend( legend_labels)
# Display the plot
plt.show()
#From here we can conclude that the internship having less than half average salary of full time, part time
# or contractor, and contractor basis job salary is almost equal to full time salary and part time job salary
#so since in part time job is having less working hours and paid amount per month is also good
new_merged_df=merged_df.dropna(subset=['location'])
#remove all location columns which contains United States beacuse we are analysing statewise
new_merged_df=new_merged_df[~new_merged_df['location'].str.contains('United States', case=False)]
#str.split will split it into two string based on comma and str[1] will give the second string and str.strip()
#would remove leading or trailing spaces and it will give state code
new_merged_df['location'] = new_merged_df['location'].str.split(',').str[1].str.strip()
#it will remove the values that contains '('
new_merged_df['location'] = new_merged_df['location'].str.split('(').str[0].str.strip()
#drop those rows which having na in column location
new_merged_df=new_merged_df.dropna(subset=['location'])
#finding the unique statecode
state_code=pd.DataFrame(new_merged_df['location'].unique())
state_code.rename(columns={0:'location'},inplace=True)
pip install geopy
Requirement already satisfied: geopy in c:\users\kumarmohi1\.conda\envs\geo_env\lib\site-packages (2.3.0) Requirement already satisfied: geographiclib<3,>=1.52 in c:\users\kumarmohi1\.conda\envs\geo_env\lib\site-packages (from geopy) (2.0) Note: you may need to restart the kernel to use updated packages.
pip install us
Requirement already satisfied: us in c:\users\kumarmohi1\.conda\envs\geo_env\lib\site-packages (3.1.1) Requirement already satisfied: jellyfish==0.11.2 in c:\users\kumarmohi1\.conda\envs\geo_env\lib\site-packages (from us) (0.11.2) Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
from us import states
# Initialize geocoder
geolocator = Nominatim(user_agent="my-app")
# State code to state name mapping
state_mapping = {state.abbr: state.name for state in states.STATES}
# Function to get latitude and longitude
def get_coordinates(location):
state_name = state_mapping.get(location)
if state_name:
location = geolocator.geocode(state_name + ", USA")
if location:
return location.latitude, location.longitude
return None, None
# Add 'State' column
state_code['State'] = state_code['location'].map(state_mapping)
# Apply the function to the 'location' column
state_code[['Latitude', 'Longitude']] = state_code['location'].apply(get_coordinates).apply(pd.Series)
#merge the state_code table to new_merged_df for refercing the state_name
new_merged_df=new_merged_df.merge(state_code,on='location')
new_merged_df.columns
Index(['index', 'title', 'company_name', 'location', 'via', 'description',
'extensions', 'job_id', 'thumbnail', 'posted_at', 'schedule_type',
'work_from_home', 'search_term', 'date_time', 'search_location',
'salary_min', 'salary_max', 'salary_standardized', 'description_tokens',
'State', 'Latitude', 'Longitude'],
dtype='object')
#creating a dataframe map_merged_df to import necessary column in it to visualize the data on geospatial
map_merged_df=new_merged_df[['location','State','Latitude','Longitude','salary_standardized']]
map_merged_df.dropna(inplace=True)
map_merged_df = map_merged_df.groupby(['State','location', 'Latitude', 'Longitude'])[['salary_standardized']].mean().reset_index()
# Create a new dataframe with the desired columns
map_merged_df = pd.DataFrame({
'location': map_merged_df['location'],
'State':map_merged_df['State'],
'Latitude': map_merged_df['Latitude'],
'Longitude': map_merged_df['Longitude'],
'salary_standardized': map_merged_df['salary_standardized']
})
C:\Users\KumarMohi1\AppData\Local\Temp\ipykernel_21004\561030069.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy map_merged_df.dropna(inplace=True)
map_merged_df
| location | State | Latitude | Longitude | salary_standardized | |
|---|---|---|---|---|---|
| 0 | AR | Arkansas | 35.204888 | -92.447911 | 98265.281749 |
| 1 | CA | California | 36.701463 | -118.755997 | 100007.862664 |
| 2 | KS | Kansas | 38.273120 | -98.582187 | 96844.914292 |
| 3 | MA | Massachusetts | 42.378877 | -72.032366 | 99369.761573 |
| 4 | MO | Missouri | 38.760481 | -92.561787 | 99192.197349 |
| 5 | NE | Nebraska | 41.737023 | -99.587382 | 35796.800000 |
| 6 | OK | Oklahoma | 34.955082 | -97.268406 | 97161.777287 |
| 7 | TX | Texas | 31.263890 | -98.545612 | 97867.240364 |
#using geopandas to converts lat and long to points
import geopandas as gpd
map_merged_df_geo=gpd.GeoDataFrame(map_merged_df,geometry=gpd.points_from_xy(map_merged_df.Longitude,map_merged_df.Latitude))
map_merged_df_geo
| location | State | Latitude | Longitude | salary_standardized | geometry | |
|---|---|---|---|---|---|---|
| 0 | AR | Arkansas | 35.204888 | -92.447911 | 98265.281749 | POINT (-92.44791 35.20489) |
| 1 | CA | California | 36.701463 | -118.755997 | 100007.862664 | POINT (-118.75600 36.70146) |
| 2 | KS | Kansas | 38.273120 | -98.582187 | 96844.914292 | POINT (-98.58219 38.27312) |
| 3 | MA | Massachusetts | 42.378877 | -72.032366 | 99369.761573 | POINT (-72.03237 42.37888) |
| 4 | MO | Missouri | 38.760481 | -92.561787 | 99192.197349 | POINT (-92.56179 38.76048) |
| 5 | NE | Nebraska | 41.737023 | -99.587382 | 35796.800000 | POINT (-99.58738 41.73702) |
| 6 | OK | Oklahoma | 34.955082 | -97.268406 | 97161.777287 | POINT (-97.26841 34.95508) |
| 7 | TX | Texas | 31.263890 | -98.545612 | 97867.240364 | POINT (-98.54561 31.26389) |
pip install plotly
Collecting plotly Using cached plotly-5.14.1-py2.py3-none-any.whl (15.3 MB) Collecting tenacity>=6.2.0 (from plotly) Using cached tenacity-8.2.2-py3-none-any.whl (24 kB) Requirement already satisfied: packaging in c:\users\kumarmohi1\.conda\envs\geo_env\lib\site-packages (from plotly) (23.1) Installing collected packages: tenacity, plotly Successfully installed plotly-5.14.1 tenacity-8.2.2 Note: you may need to restart the kernel to use updated packages.
import geopandas as gpd
#it will show the natural earth map
world_data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
C:\Users\KumarMohi1\AppData\Local\Temp\ipykernel_21004\1209001884.py:3: FutureWarning: The geopandas.dataset module is deprecated and will be removed in GeoPandas 1.0. You can get the original 'naturalearth_lowres' data from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/.
world_data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
import plotly.express as px
fig = px.choropleth(
map_merged_df_geo,
locationmode='USA-states',
locations='location',
scope='usa',
color='salary_standardized',
color_continuous_scale='Viridis',
labels={'average_salary': 'Average Salary'},
)
# Add location annotations
for i, row in map_merged_df_geo.iterrows():
fig.add_annotation(
x=row['Longitude'],
y=row['Latitude'],
text=row['location'],
showarrow=False,
font=dict(size=8),
)
fig.update_layout(
title_text='Average Salary by State',
geo=dict(
scope='usa',
projection_type='albers usa',
showlakes=True, # You can set this to False if you don't want to show lakes
lakecolor='rgb(255, 255, 255)',
)
)
fig.show()
#Show from here we can say that CA i.e, California having higest average salary provider for Data Analyst salary in
#among states and Nebraska NE is providing min average salary in among states which less than half the reason behind it
new_merged_df[new_merged_df['location']=='NE']
new_merged_df['salary_standardized'].min()
#So here we can see for NE i.e, Nebraska.we have only one opening so for that company is providing very less average salary
#even it is full time though they are providing less salary even it is min salary is provided by any designation
35796.8
#here spliting the location by comma to get the exact state associated with that
import pandas as pd
# Assuming you have a DataFrame called 'statewise' with a 'location' column
statewise = pd.DataFrame()
statewise['location']=merged_df['location']
# Define a function to extract the second string after a comma
def extract_second_string(value):
if isinstance(value, str):
if ',' in value:
return value.split(',')[1].strip()
return str(value)
# Apply the function to the 'location' column
statewise = pd.DataFrame(statewise['location'].apply(extract_second_string))
# Remove leading/trailing spaces from all columns in the DataFrame
statewise =pd.DataFrame(statewise.applymap(lambda x: x.strip() if isinstance(x, str) else x))
# Display the updated DataFrame
statewise['location'].unique()
array(['OK', 'United States', 'Anywhere', 'AR', 'MO', 'KS', 'Oklahoma',
'Missouri', 'CA', 'Kansas', 'United States (+9 others)',
'United States (+2 others)', 'United States (+13 others)',
'United States (+1 other)', 'KS (+2 others)',
'United States (+7 others)', 'United States (+18 others)',
'United States (+15 others)', 'TX', 'United States (+25 others)',
'United States (+26 others)', 'United States (+11 others)', 'MA',
'United States (+10 others)', 'United States (+25 others)',
'United States (+1 other)', 'NE', 'nan', 'DC'], dtype=object)
import pandas as pd
# Assuming you have two DataFrames: statewise and state_code
# Merge the tables based on the 'location' column
statewise_merge = pd.merge(statewise, state_code, on='location', how='left')
statewise_merge['location'].unique()
array(['OK', 'United States', 'Anywhere', 'AR', 'MO', 'KS', 'Oklahoma',
'Missouri', 'CA', 'Kansas', 'United States (+9 others)',
'United States (+2 others)', 'United States (+13 others)',
'United States (+1 other)', 'KS (+2 others)',
'United States (+7 others)', 'United States (+18 others)',
'United States (+15 others)', 'TX', 'United States (+25 others)',
'United States (+26 others)', 'United States (+11 others)', 'MA',
'United States (+10 others)', 'United States (+25 others)',
'United States (+1 other)', 'NE', 'nan', 'DC'], dtype=object)
#where ever state is na replace those with the location
statewise_merge['State'].fillna(statewise_merge['location'], inplace=True)
#wherever State is na fill that will be shown as Not available
statewise_merge['State']=statewise_merge['State'].replace('nan','Not available')
statewise_merge['State'].unique()
array(['Oklahoma', 'United States', 'Anywhere', 'Arkansas', 'Missouri',
'Kansas', 'California', 'United States (+9 others)',
'United States (+2 others)', 'United States (+13 others)',
'United States (+1 other)', 'KS (+2 others)',
'United States (+7 others)', 'United States (+18 others)',
'United States (+15 others)', 'Texas',
'United States (+25 others)', 'United States (+26 others)',
'United States (+11 others)', 'Massachusetts',
'United States (+10 others)', 'United States (+25 others)',
'United States (+1 other)', 'Nebraska', 'Not available', 'DC'],
dtype=object)
#replace United States (+ occupied string values to United States+ and 'KS (+2 others)' as 'KS')
def modify_country(value):
if "United States" in value:
if "(+" in value:
return value.split("(+")[0].strip() + "+"
else:
return value.strip()+'+'
elif 'KS' in value:
return value.split("(+")[0].strip()
else:
return value
# Apply the function to the 'State' column
statewise_merge['State'] = statewise_merge['State'].apply(modify_country)
statewise_merge['State'].unique()
array(['Oklahoma', 'United States+', 'Anywhere', 'Arkansas', 'Missouri',
'Kansas', 'California', 'KS', 'Texas', 'Massachusetts', 'Nebraska',
'Not available', 'DC'], dtype=object)
#replacing KS location to Kansas
statewise_merge.loc[statewise_merge['State'] == 'KS', 'State'] = 'Kansas'
statewise_merge['State'].unique()
array(['Oklahoma', 'United States+', 'Anywhere', 'Arkansas', 'Missouri',
'Kansas', 'California', 'Texas', 'Massachusetts', 'Nebraska',
'Not available', 'DC'], dtype=object)
import pandas as pd
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
plt.hist(statewise_merge['State'], bins=len(statewise_merge['State'].unique()), edgecolor='black', alpha=0.7)
# Customizations
plt.xlabel('Location', fontweight='bold', fontsize=12)
plt.ylabel('Number of Openings', fontweight='bold', fontsize=12)
plt.title('Distribution of Locations', fontweight='bold', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout() # Adjust spacing to prevent label overlapping
plt.show()
#here we can see the highesh job opening advertized location for anywhere then it was in United State location and
#in states Missouri has highest number of job opening
#Just considering description_tokens column because we will count number of times it's ask for same skills
job_des=pd.DataFrame(merged_df['description_tokens'])
#removing those column which is having '[]' in descrptions
job_des = job_des[~(job_des['description_tokens'] == '[]')]
#droping na values
job_des.dropna(inplace=True)
import pandas as pd
# Split the values in the 'description_tokens' column into separate columns by comma and removing '[]'
job_des['description_tokens'] = job_des['description_tokens'].str.strip("[]").str.replace("'", "").str.split(", ")
# Find the maximum number of columns across all rows
max_columns = job_des['description_tokens'].apply(len).max()
# Create column names for the expanded columns
column_names = [f'description_tokens{i}' for i in range(1, max_columns + 1)]
# Expand the split values into separate columns
job_des = job_des.join(pd.DataFrame(job_des['description_tokens'].to_list(), columns=column_names))
# Drop the original 'description_tokens' column
job_des = job_des.drop('description_tokens', axis=1)
import pandas as pd
import numpy as np
# Assuming you have a DataFrame named 'description_tokens' representing a table
# Iterate over each column
for column in job_des.columns:
# Replace NaN values with a temporary value
job_des[column] = job_des[column].fillna(np.nan)
# Check if column values contain 'sql' and update them, ignoring rows with NaN
job_des.loc[job_des[column].str.contains('sql', na=False), column] = 'sql'
import pandas as pd
import numpy as np
# Assuming your data is stored in a DataFrame named 'job_des' with 21 columns containing job description_tokens(skills) names
# Calculate the total count for each skill across all columns
total_counts = {}
for column in job_des.columns:
counts = job_des[column].str.lower().value_counts(dropna=False).to_dict()
for skill, count in counts.items():
if pd.notnull(skill):
total_counts[skill] = total_counts.get(skill, 0) + count
# Convert the dictionary to a pandas Series for easy manipulation
total_counts_series = pd.Series(total_counts)
# Display the total counts for all skills
print(total_counts_series)
sql 9154
excel 5468
power_bi 4308
r 2894
spark 677
...
c/c++ 21
node.js 8
bitbucket 20
mxnet 1
redis 5
Length: 112, dtype: int64
#converting total_counts_series to DataFrame
total_counts_series=pd.DataFrame(total_counts_series)
#renaming column name
total_counts_series.rename(columns={0:'Count of skills in job description'},inplace=True)
#top 10 demanding skills
total_counts_series = total_counts_series.sort_values('Count of skills in job description',ascending=False)
total_counts_series=total_counts_series.head(10)
job_des.shape
#so here we have 15581 jobs for which data we are performing
(15581, 21)
import matplotlib.pyplot as plt
# Assuming you have defined the 'total_counts_series' variable with the data
total_counts_percentage = (total_counts_series / 15581) * 100
# Define a custom color palette
color_palette = ['#FFC107', '#3F51B5', '#4CAF50', '#F44336', '#9C27B0']
plt.figure(figsize=(10, 6))
total_counts_percentage.plot.bar(legend=False, color=color_palette)
plt.xlabel('Skills', fontweight='bold')
plt.ylabel('Percentage of Job Openings Demanding', fontweight='bold')
plt.title('Percentage of Skills in Job Description', fontweight='bold')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# Display the chart
plt.show()
#here we can see the most demanding skill for a data analyst is sql around 60 % of companies asking for this skill as
# well as excel is second most demanding skill for same then tableau, python and power_bi is also demanding skills
<Figure size 1000x600 with 0 Axes>
#since in gsearch_via_count already top 10 sites are there so taking head count of 5 for top 5 recruiters
top_5_sites=gsearch_via_count.head(5)
top_5_sites['via']=top_5_sites.index
top_5_sites = top_5_sites.reset_index(drop=True)
C:\Users\KumarMohi1\AppData\Local\Temp\ipykernel_21004\3541795469.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
top_5_sites = top_5_sites.reset_index(drop=True)
sites_merged_df=merged_df
sites_merged_df['via']=sites_merged_df.loc[merged_df['via'].str.contains('via'), 'via'].str.replace('via', '')
merge_sites_top_5 = sites_merged_df.merge(top_5_sites, on='via')
merge_sites_top_5.dropna(subset=['salary_standardized'], inplace=True)
merge_sites_top_5.info()
<class 'pandas.core.frame.DataFrame'> Index: 13156 entries, 0 to 13240 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 13156 non-null int64 1 title 13156 non-null object 2 company_name 13156 non-null object 3 location 13141 non-null object 4 via 13156 non-null object 5 description 13156 non-null object 6 extensions 13156 non-null object 7 job_id 13156 non-null object 8 thumbnail 7525 non-null object 9 posted_at 13156 non-null object 10 schedule_type 13153 non-null object 11 work_from_home 7655 non-null object 12 search_term 13156 non-null object 13 date_time 13156 non-null object 14 search_location 13156 non-null object 15 salary_min 13156 non-null float64 16 salary_max 13156 non-null float64 17 salary_standardized 13156 non-null float64 18 description_tokens 13156 non-null object 19 count 13156 non-null int64 dtypes: float64(3), int64(2), object(15) memory usage: 2.1+ MB
import matplotlib.pyplot as plt
# Group the DataFrame by the 'via' column and calculate the average salary
av_sal = merge_sites_top_5.groupby('via')['salary_standardized'].mean()
# Define custom colors for the bar chart
colors = ['#2196F3', '#FF5722', '#4CAF50', '#FFC107', '#9C27B0']
# Plot the average salary using a bar chart
plt.figure(figsize=(10, 6))
av_sal.plot(kind='bar', color=colors, edgecolor='black')
# Add gridlines
plt.grid(axis='y', linestyle='--')
# Set labels and title
plt.xlabel('Job Sites', fontweight='bold')
plt.ylabel('Average Salary', fontweight='bold')
plt.title('Average Salary by Sites', fontweight='bold')
# Remove the top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Customize tick labels
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='x', labelsize=8)
# Display the chart
plt.tight_layout()
plt.show()
#From here we can say job site Upwork provides less average salary compare to othere and apart from that there is
#less differences in average salaries of all job advertising sites
Based on our thorough analysis of the gsearch_jobs dataset, we have gathered some key insights that can help us better understand the job landscape for Data Analysts. Here's a summary of our findings:
When it comes to job types, we noticed that most companies prefer hiring candidates for fulltime positions. However, we also observed a significant number of job openings for contractbased roles.
LinkedIn emerged as the most widely used platform for job recruitment among the sites we analyzed. Upwork and Bee also had a notable presence, although LinkedIn maintained a clear lead.
During our analysis, we identified the Corps team as the top payer, offering an impressive maximum salary of 428,480. This figure sets them apart from other companies in terms of compensation.
Our analysis revealed that internships tend to have an average salary that is less than half of what full-time, part-time, or contractor positions offer. However, contractor roles showed a salary level on par with full-time positions, while part-time positions had a slightly lower average salary. This suggests that part-time jobs can provide decent monthly pay despite fewer working hours.
When examining average salaries across different states, we found that California (CA) offers the highest average salary for Data Analyst positions. On the other hand, Nebraska (NE) provides the lowest average salary, which is less than half of what other states offer.
The United States as a whole has the highest number of job openings. Within the states, Missouri stood out with a significant number of advertised job opportunities.
Based on our analysis, SQL emerged as the most sought-after skill, with approximately 60% of companies emphasizing its importance. Excel ranked second in demand, followed by Tableau, Python, and Power BI.
We observed that Upwork tends to provide a relatively lower average salary compared to other job advertising sites. However, the differences in average salaries among all the platforms were relatively minor.
These insights can serve as a helpful guide for individuals pursuing a career as a Data Analyst, providing valuable information about job trends and skill requirements. It's important to note that these findings are based on the analysis of the gsearch_jobs dataset and should be further explored and validated for more precise interpretations.